suppressPackageStartupMessages(suppressWarnings({
    library(tidyverse)
  library(data.table)
  library(knitr)
  library(gridExtra)
}))
opts_chunk$set(cache=TRUE, echo=TRUE, message=FALSE, warning=FALSE)
load("../results/wc_sims.rdata")

What fraction of discovered SNPs will be significantly overestimated for a given minimum rsq value?

ggplot(l, aes(x=nid, y=nsig_overestimate / nsig)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_x_log10() +
  ylim(c(0,1)) +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ .)

And what fraction will be weak?

p0 <- l %>%
  mutate(h2 = paste0("h2 = ", h2), S=paste0("S = ", S)) %>%
ggplot(., aes(x=nid, y=nsig_overestimate / nsig)) +
  geom_point(aes(colour=as.factor(nsnp)), size=0.4) +
  geom_smooth(aes(colour=as.factor(nsnp)), se=FALSE) +
  scale_colour_brewer(type="seq", palette=2) +
  facet_grid(h2 ~ S) +
  ylim(c(0,1)) +
  labs(x="Sample size", y="Proportion of discovered effects overestimated", colour="Number of causal variants") +
  theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5), legend.position="bottom")
p0

Relationship between overestimates and weak? As power gets better more of the overestimates are still strong

ggplot(l, aes(x=nsig_overestimate, y=nsig_weak)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_x_log10() +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)

How many just overestimated

ggplot(l, aes(x=nid, y=nsig_overestimate)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_x_log10() +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)

ggplot(l, aes(x=nid, y=nsig_overestimate/nsig)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_x_log10() +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)

ggplot(l, aes(x=nid, y=min_rsq)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)

ggplot(l, aes(x=nid, y=nsig)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)

ggplot(l, aes(x=nsig, y=nsig_overestimate/nsig)) +
geom_point(aes(colour=as.factor(nsnp))) +
geom_smooth(aes(colour=as.factor(nsnp))) +
scale_colour_brewer(type="qual") +
facet_grid(h2 ~ S)


Sample split GWAS

load("../results/instrument_list.rdata")
counts <- fread("../results/count_tophits.txt")
instrument_counts <- subset(instrument_counts, !is.na(BETA.disc))
weak_p <- pf(10, 1, 100000, low=FALSE)

Quick summary

s1 <- instrument_counts %>%
    group_by(id) %>%
    summarise(
        ndisc=sum(P_BOLT_LMM_INF.disc < 5e-8),
        nrepl=sum(P_BOLT_LMM_INF.repl < 5e-8),
        nhalf=mean(c(ndisc, nrepl)),
        ntot=sum(P_BOLT_LMM_INF.disc < 5e-8 | P_BOLT_LMM_INF.repl < 5e-8),
        ndisc_weak=sum(P_BOLT_LMM_INF.disc < 5e-8 & P_BOLT_LMM_INF.repl > weak_p),
        nrepl_weak=sum(P_BOLT_LMM_INF.repl < 5e-8 & P_BOLT_LMM_INF.disc > weak_p),
        ndisc_overestimated=sum(P_BOLT_LMM_INF.disc < 5e-8 & (abs(BETA.disc) - 1.96 * SE.disc) > abs(BETA.repl)),
        nrepl_overestimated=sum(P_BOLT_LMM_INF.repl < 5e-8 & (abs(BETA.repl) - 1.96 * SE.repl) > abs(BETA.disc)),
    nhalf_overestimated=mean(c(ndisc_overestimated, nrepl_overestimated)),
        ndisc_underestimated=sum(P_BOLT_LMM_INF.disc < 5e-8 & (abs(BETA.disc) + 1.96 * SE.disc) < abs(BETA.repl)),
        nrepl_underestimated=sum(P_BOLT_LMM_INF.repl < 5e-8 & (abs(BETA.repl) + 1.96 * SE.repl) < abs(BETA.disc)),
    nhalf_underestimated=mean(c(ndisc_underestimated, nrepl_underestimated)),
        nhalf_weak=mean(c(ndisc_weak, nrepl_weak))
    )

Number of traits with an instrument

sum(s1$nhalf > 0)
## [1] 681

Total number of instruments

sum(s1$nhalf)
## [1] 13698
sum(s1$ndisc)
## [1] 13673
sum(s1$nrepl)
## [1] 13723
sum(s1$ntot)
## [1] 18482

how many instruments per trait

summary(s1$ndisc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   12.13    4.00  477.00
summary(s1$nrepl)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00   12.18    3.00  460.00
summary(s1$ndisc[s1$ndisc != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    3.00   23.17   11.00  477.00
summary(s1$nrepl[s1$nrepl != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    3.00   23.54   11.00  460.00

Number of weak instruments

sum(s1$nhalf_weak)
## [1] 681
sum(s1$ndisc_weak)
## [1] 734
sum(s1$nrepl_weak)
## [1] 628

Number of overestimated instruments

sum(s1$nhalf_overestimated)
## [1] 2589
sum(s1$ndisc_overestimated)
## [1] 2623
sum(s1$nrepl_overestimated)
## [1] 2555

Overestimated instruments

summary(s1$nhalf_overestimated[s1$nhalf != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.500   0.500   3.802   2.500  60.000
summary(s1$ndisc_overestimated[s1$ndisc != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   4.446   3.000  56.000
summary(s1$nrepl_overestimated[s1$nrepl != 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   4.383   4.000  64.000
summary(s1$nhalf_overestimated/s1$nhalf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.1163  0.2302  0.3548  0.5000  1.0000     446
summary(s1$ndisc_overestimated/s1$ndisc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.2000  0.3098  0.5000  1.0000     537
summary(s1$nrepl_overestimated/s1$nrepl)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.1875  0.2978  0.5000  1.0000     544
hist(s1$nhalf_overestimated / s1$nhalf, breaks=100)

hist(s1$nhalf_weak / s1$nhalf, breaks=100)

ggplot(s1, aes(x=ndisc, y=ndisc_overestimated / ndisc)) +
  geom_point()

ggplot(s1, aes(x=ndisc, y=ndisc_weak / ndisc)) +
  geom_point()

p1 <- bind_rows(
  l %>% dplyr::select(nsig=nsig, nsnp=nsnp, nsig_overestimate) %>% mutate(what="Simulations"),
  s1 %>% dplyr::select(nsig=ndisc, nsig_overestimate=ndisc_overestimated) %>% mutate(what="UK Biobank")
) %>%
  {
ggplot(., aes(x=nsig, y=nsig_overestimate/nsig)) +
  geom_point(data=subset(., what !="UK Biobank"), aes(colour=as.factor(nsnp)), size=0.1, alpha=0.5) +
  geom_smooth(data=subset(., what !="UK Biobank"), aes(colour=as.factor(nsnp)), se=FALSE) +
  geom_point(data=subset(., what =="UK Biobank")) +
  scale_colour_brewer(type="seq", palette=2) +
  # scale_x_log10() +
      xlim(c(0,500)) +
  labs(x="Discovered associations", y="Proportion overestimated", colour="Number of causal variants", shape="")
    
  }
p1

s1 %>% 
  mutate(propover=ndisc_overestimated/ndisc, propweak=ndisc_weak/ndisc) %>%
  dplyr::select(id, propover, propweak) %>%
  pivot_longer(cols=c(propover, propweak)) %>%
  ggplot(., aes(x=value)) +
  geom_histogram(aes(fill=name), position="dodge", bins=100)

  # geom_density(aes(fill=name), alpha=0.2)
ggplot(s1, aes(x=ndisc_overestimated/ndisc)) +
  geom_histogram(bins=100)

Figure

g_legend<-function(a.gplot)
{
    tmp <- ggplot_gtable(ggplot_build(a.gplot))
    leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
    legend <- tmp$grobs[[leg]]
    return(legend)
}
mylegend <- g_legend(p0 + theme(legend.position="bottom"))
p4 <- grid.arrange(
    mylegend, arrangeGrob(p0 + theme(legend.position="none") + labs(title="A"),
                p1 + theme(legend.position="none") + labs(title="B", y=""),
                nrow=2),
    nrow=2, heights=c(1, 10)
)